In [2]:
# import libraries

import pandas as pd
import numpy as np
from sklearn import datasets
from evidently.report import Report
from evidently.metrics import DataDriftTable
from evidently.metrics import DatasetDriftMetric

# Create reference and current datasets for drift detection.
# Reference = rows OUTSIDE the three most common education levels,
# current = rows INSIDE them, which guarantees visible categorical drift
# on the `education` column.
adult_data = datasets.fetch_openml(name='adult', version=2, as_frame='auto')
adult = adult_data.frame

_common_education = ['Some-college', 'HS-grad', 'Bachelors']
# .copy() makes each split an independent frame; without it the iloc
# assignment below mutates a slice of `adult` and raises
# SettingWithCopyWarning (and may silently not apply under copy-on-write).
adult_ref = adult[~adult.education.isin(_common_education)].copy()
adult_cur = adult[adult.education.isin(_common_education)].copy()

# Inject missing values into the current data to simulate data-quality drift
adult_cur.iloc[:2000, 3:5] = np.nan
In [8]:
# Generate the drift report with dataset-level metrics:
# - DatasetDriftMetric: overall drift verdict + share of drifted columns
# - DataDriftTable: per-column drift scores and distributions
drift_metrics = [
    DatasetDriftMetric(),
    DataDriftTable(),
]
data_drift_dataset_report = Report(metrics=drift_metrics)
data_drift_dataset_report.run(reference_data=adult_ref, current_data=adult_cur)
# Bare last expression renders the interactive HTML report inline
data_drift_dataset_report
Out[8]:
Loading...
In [4]:
# Export the same report as a JSON string (machine-readable metric values,
# e.g. for logging or downstream alerting)
data_drift_dataset_report.json()
Out[4]:
'{"version": "0.2.8", "timestamp": "2023-03-30 15:50:02.289539", "metrics": [{"metric": "DatasetDriftMetric", "result": {"drift_share": 0.5, "number_of_columns": 15, "number_of_drifted_columns": 5, "share_of_drifted_columns": 0.3333333333333333, "dataset_drift": false}}, {"metric": "DataDriftTable", "result": {"number_of_columns": 15, "number_of_drifted_columns": 5, "share_of_drifted_columns": 0.3333333333333333, "dataset_drift": false, "drift_by_columns": {"age": {"column_name": "age", "column_type": "num", "stattest_name": "Wasserstein distance (normed)", "stattest_threshold": 0.1, "drift_score": 0.18534692319041995, "drift_detected": true, "current": {"small_distribution": {"x": [17.0, 24.3, 31.6, 38.9, 46.2, 53.5, 60.8, 68.1, 75.4, 82.7, 90.0], "y": [0.02471021672878118, 0.025839691234843417, 0.0262859521410848, 0.025211766596857754, 0.015942967066340047, 0.010173168977679455, 0.0061528716099474344, 0.0018640278561586543, 0.000568686464590777, 0.0002369526935794904]}}, "reference": {"small_distribution": {"x": [17.0, 24.3, 31.6, 38.9, 46.2, 53.5, 60.8, 68.1, 75.4, 82.7, 90.0], "y": [0.02104876054252575, 0.020739077628796638, 0.02384558435714183, 0.026835959992838568, 0.018658395552179158, 0.012580868370245284, 0.00869047676652328, 0.0029516652714806184, 0.001287119610186633, 0.000348393277945254]}}}, "capital-gain": {"column_name": "capital-gain", "column_type": "num", "stattest_name": "Wasserstein distance (normed)", "stattest_threshold": 0.1, "drift_score": 0.0817732650223147, "drift_detected": false, "current": {"small_distribution": {"x": [0.0, 9999.9, 19999.8, 29999.699999999997, 39999.6, 49999.5, 59999.399999999994, 69999.3, 79999.2, 89999.09999999999, 99999.0], "y": [9.822510079686088e-05, 1.2569676248842519e-06, 1.7297719608498882e-07, 1.1531813072332584e-08, 2.882953268083146e-09, 0.0, 0.0, 0.0, 0.0, 3.3153962582956155e-07]}}, "reference": {"small_distribution": {"x": [0.0, 9999.9, 19999.8, 29999.699999999997, 39999.6, 49999.5, 59999.399999999994, 
69999.3, 79999.2, 89999.09999999999, 99999.0], "y": [9.634147913361861e-05, 2.2395137409516095e-06, 4.804004239265283e-07, 1.4129424233133181e-08, 1.4129424233133181e-08, 0.0, 0.0, 0.0, 0.0, 9.113478630370894e-07]}}}, "capital-loss": {"column_name": "capital-loss", "column_type": "num", "stattest_name": "Wasserstein distance (normed)", "stattest_threshold": 0.1, "drift_score": 0.03378837284315251, "drift_detected": false, "current": {"small_distribution": {"x": [0.0, 435.6, 871.2, 1306.8000000000002, 1742.4, 2178.0, 2613.6000000000004, 3049.2000000000003, 3484.8, 3920.4, 4356.0], "y": [0.0021929683487458603, 1.1912910903101098e-06, 1.257473928660671e-06, 3.4745990134044877e-05, 5.201971094354147e-05, 1.2310007933204457e-05, 5.95645545155055e-07, 6.618283835056166e-08, 3.309141917528083e-07, 1.98548515051685e-07]}}, "reference": {"small_distribution": {"x": [0.0, 390.0, 780.0, 1170.0, 1560.0, 1950.0, 2340.0, 2730.0, 3120.0, 3510.0, 3900.0], "y": [0.002434221847856606, 1.0868679183762194e-06, 1.2680125714389225e-06, 7.970364734758943e-06, 6.23137606535699e-05, 3.967067902073201e-05, 1.4672716898078962e-05, 2.173735836752439e-06, 1.811446530627032e-07, 5.434339591881097e-07]}}}, "education-num": {"column_name": "education-num", "column_type": "num", "stattest_name": "Wasserstein distance (normed)", "stattest_threshold": 0.1, "drift_score": 0.6184494168858121, "drift_detected": true, "current": {"small_distribution": {"x": [9.0, 9.4, 9.8, 10.2, 10.6, 11.0, 11.4, 11.8, 12.2, 12.6, 13.0], "y": [1.138984917551319, 0.0, 0.7847156361856423, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5762994462630397]}}, "reference": {"small_distribution": {"x": [1.0, 2.5, 4.0, 5.5, 7.0, 8.5, 10.0, 11.5, 13.0, 14.5, 16.0], "y": [0.015542211232779936, 0.023972683386318142, 0.08058401036147415, 0.06541858000706464, 0.11628399858707171, 0.0, 0.09706817379018015, 0.07540327328388084, 0.12513834922877665, 0.06725538678912045]}}}, "fnlwgt": {"column_name": "fnlwgt", "column_type": "num", "stattest_name": 
"Wasserstein distance (normed)", "stattest_threshold": 0.1, "drift_score": 0.02364332069106699, "drift_detected": false, "current": {"small_distribution": {"x": [12285.0, 160096.5, 307908.0, 455719.5, 603531.0, 751342.5, 899154.0, 1046965.5, 1194777.0, 1342588.5, 1490400.0], "y": [2.7682113067215177e-06, 3.1635586130503076e-06, 7.115081270288237e-07, 9.166908434856009e-08, 2.184454775965687e-08, 4.2908933099325996e-09, 2.340487259963236e-09, 1.170243629981618e-09, 3.900812099938727e-10, 3.900812099938727e-10]}}, "reference": {"small_distribution": {"x": [13769.0, 157935.6, 302102.2, 446268.80000000005, 590435.4, 734602.0, 878768.6000000001, 1022935.2000000001, 1167101.8, 1311268.4000000001, 1455435.0], "y": [2.7559464969112866e-06, 3.229318530342351e-06, 8.061045496833332e-07, 1.1172756068559274e-07, 2.1561459079675788e-08, 8.330563735329276e-09, 9.80066321803445e-10, 9.80066321803445e-10, 4.900331609017221e-10, 9.80066321803446e-10]}}}, "hours-per-week": {"column_name": "hours-per-week", "column_type": "num", "stattest_name": "Wasserstein distance (normed)", "stattest_threshold": 0.1, "drift_score": 0.08859914569715735, "drift_detected": false, "current": {"small_distribution": {"x": [1.0, 10.8, 20.6, 30.400000000000002, 40.2, 50.0, 59.800000000000004, 69.60000000000001, 79.4, 89.2, 99.0], "y": [0.0020562899821905873, 0.006445395351902112, 0.007127883586334467, 0.0567789026412883, 0.009940205793736761, 0.012011204574083209, 0.005336351970949533, 0.0013620347092335367, 0.0006089442436530068, 0.0003736034731590911]}}, "reference": {"small_distribution": {"x": [1.0, 10.8, 20.6, 30.400000000000002, 40.2, 50.0, 59.800000000000004, 69.60000000000001, 79.4, 89.2, 99.0], "y": [0.003070956393860971, 0.008196425868121887, 0.007028597380315601, 0.05289830520692911, 0.009313792631146421, 0.012579387106308432, 0.005968901159898786, 0.001585939921712239, 0.0007785523252041899, 0.0006199583330329661]}}}, "class": {"column_name": "class", "column_type": "cat", "stattest_name": 
"Jensen-Shannon distance", "stattest_threshold": 0.1, "drift_score": 0.034295484989628915, "drift_detected": false, "current": {"small_distribution": {"x": ["<=50K", ">50K"], "y": [26808, 7879]}}, "reference": {"small_distribution": {"x": ["<=50K", ">50K"], "y": [10347, 3808]}}}, "education": {"column_name": "education", "column_type": "cat", "stattest_name": "Jensen-Shannon distance", "stattest_threshold": 0.1, "drift_score": 0.8325546111576977, "drift_detected": true, "current": {"small_distribution": {"x": ["10th", "11th", "12th", "1st-4th", "5th-6th", "7th-8th", "9th", "Assoc-acdm", "Assoc-voc", "Bachelors", "Doctorate", "HS-grad", "Masters", "Preschool", "Prof-school", "Some-college"], "y": [0, 0, 0, 0, 0, 0, 0, 0, 0, 7535, 0, 14892, 0, 0, 0, 10260]}}, "reference": {"small_distribution": {"x": ["10th", "11th", "12th", "1st-4th", "5th-6th", "7th-8th", "9th", "Assoc-acdm", "Assoc-voc", "Bachelors", "Doctorate", "HS-grad", "Masters", "Preschool", "Prof-school", "Some-college"], "y": [1389, 1812, 657, 247, 509, 955, 756, 1601, 2061, 0, 594, 0, 2657, 83, 834, 0]}}}, "marital-status": {"column_name": "marital-status", "column_type": "cat", "stattest_name": "Jensen-Shannon distance", "stattest_threshold": 0.1, "drift_score": 0.035801921750473246, "drift_detected": false, "current": {"small_distribution": {"x": ["Divorced", "Married-AF-spouse", "Married-civ-spouse", "Married-spouse-absent", "Never-married", "Separated", "Widowed"], "y": [4819, 30, 15558, 406, 11805, 1060, 1009]}}, "reference": {"small_distribution": {"x": ["Divorced", "Married-AF-spouse", "Married-civ-spouse", "Married-spouse-absent", "Never-married", "Separated", "Widowed"], "y": [1814, 7, 6821, 222, 4312, 470, 509]}}}, "native-country": {"column_name": "native-country", "column_type": "cat", "stattest_name": "Jensen-Shannon distance", "stattest_threshold": 0.1, "drift_score": 0.10732727839423434, "drift_detected": true, "current": {"small_distribution": {"x": ["Cambodia", "Canada", "China", 
"Columbia", "Cuba", "Dominican-Republic", "Ecuador", "El-Salvador", "England", "France", "Germany", "Greece", "Guatemala", "Haiti", "Holand-Netherlands", "Honduras", "Hong", "Hungary", "India", "Iran", "Ireland", "Italy", "Jamaica", "Japan", "Laos", "Mexico", "Nicaragua", "Outlying-US(Guam-USVI-etc)", "Peru", "Philippines", "Poland", "Portugal", "Puerto-Rico", "Scotland", "South", "Taiwan", "Thailand", "Trinadad&Tobago", "United-States", "Vietnam", "Yugoslavia"], "y": [22, 123, 66, 51, 74, 45, 30, 60, 86, 22, 147, 30, 29, 46, 1, 12, 14, 13, 63, 37, 28, 56, 75, 68, 13, 312, 31, 18, 34, 216, 57, 27, 116, 16, 91, 33, 19, 16, 31848, 63, 16]}}, "reference": {"small_distribution": {"x": ["Cambodia", "Canada", "China", "Columbia", "Cuba", "Dominican-Republic", "Ecuador", "El-Salvador", "England", "France", "Germany", "Greece", "Guatemala", "Haiti", "Holand-Netherlands", "Honduras", "Hong", "Hungary", "India", "Iran", "Ireland", "Italy", "Jamaica", "Japan", "Laos", "Mexico", "Nicaragua", "Outlying-US(Guam-USVI-etc)", "Peru", "Philippines", "Poland", "Portugal", "Puerto-Rico", "Scotland", "South", "Taiwan", "Thailand", "Trinadad&Tobago", "United-States", "Vietnam", "Yugoslavia"], "y": [6, 59, 56, 34, 64, 58, 15, 95, 41, 16, 59, 19, 59, 29, 0, 8, 16, 6, 88, 22, 9, 49, 31, 24, 10, 639, 18, 5, 12, 79, 30, 40, 68, 5, 24, 32, 11, 11, 11984, 23, 7]}}}, "occupation": {"column_name": "occupation", "column_type": "cat", "stattest_name": "Jensen-Shannon distance", "stattest_threshold": 0.1, "drift_score": 0.15320104636275717, "drift_detected": true, "current": {"small_distribution": {"x": ["Adm-clerical", "Armed-Forces", "Craft-repair", "Exec-managerial", "Farming-fishing", "Handlers-cleaners", "Machine-op-inspct", "Other-service", "Priv-house-serv", "Prof-specialty", "Protective-serv", "Sales", "Tech-support", "Transport-moving"], "y": [4670, 10, 4501, 4505, 939, 1422, 2122, 3366, 129, 3216, 781, 4351, 1042, 1726]}}, "reference": {"small_distribution": {"x": ["Adm-clerical", 
"Armed-Forces", "Craft-repair", "Exec-managerial", "Farming-fishing", "Handlers-cleaners", "Machine-op-inspct", "Other-service", "Priv-house-serv", "Prof-specialty", "Protective-serv", "Sales", "Tech-support", "Transport-moving"], "y": [941, 5, 1611, 1581, 551, 650, 900, 1557, 113, 2956, 202, 1153, 404, 629]}}}, "race": {"column_name": "race", "column_type": "cat", "stattest_name": "Jensen-Shannon distance", "stattest_threshold": 0.1, "drift_score": 0.01943554478937471, "drift_detected": false, "current": {"small_distribution": {"x": ["Amer-Indian-Eskimo", "Asian-Pac-Islander", "Black", "Other", "White"], "y": [329, 1046, 3362, 240, 29710]}}, "reference": {"small_distribution": {"x": ["Amer-Indian-Eskimo", "Asian-Pac-Islander", "Black", "Other", "White"], "y": [141, 473, 1323, 166, 12052]}}}, "relationship": {"column_name": "relationship", "column_type": "cat", "stattest_name": "Jensen-Shannon distance", "stattest_threshold": 0.1, "drift_score": 0.03114372566505728, "drift_detected": false, "current": {"small_distribution": {"x": ["Husband", "Not-in-family", "Other-relative", "Own-child", "Unmarried", "Wife"], "y": [13682, 9073, 1038, 5634, 3604, 1656]}}, "reference": {"small_distribution": {"x": ["Husband", "Not-in-family", "Other-relative", "Own-child", "Unmarried", "Wife"], "y": [6034, 3510, 468, 1947, 1521, 675]}}}, "sex": {"column_name": "sex", "column_type": "cat", "stattest_name": "Jensen-Shannon distance", "stattest_threshold": 0.1, "drift_score": 0.018953472036037717, "drift_detected": false, "current": {"small_distribution": {"x": ["Female", "Male"], "y": [11752, 22935]}}, "reference": {"small_distribution": {"x": ["Female", "Male"], "y": [4440, 9715]}}}, "workclass": {"column_name": "workclass", "column_type": "cat", "stattest_name": "Jensen-Shannon distance", "stattest_threshold": 0.1, "drift_score": 0.04051769065509631, "drift_detected": false, "current": {"small_distribution": {"x": ["Federal-gov", "Local-gov", "Never-worked", "Private", 
"Self-emp-inc", "Self-emp-not-inc", "State-gov", "Without-pay"], "y": [1071, 2050, 4, 24520, 1176, 2610, 1336, 17]}}, "reference": {"small_distribution": {"x": ["Federal-gov", "Local-gov", "Never-worked", "Private", "Self-emp-inc", "Self-emp-not-inc", "State-gov", "Without-pay"], "y": [361, 1086, 6, 9386, 519, 1252, 645, 4]}}}}}}]}'
In [10]:
import pandas as pd
from sklearn import datasets 
# NOTE(review): the Dashboard/Tab API is deprecated in this evidently
# version (see the UserWarnings emitted by this cell) — the Report +
# metrics API used in the earlier cells is the supported replacement.
from evidently.dashboard import Dashboard
from evidently.dashboard.tabs import DataDriftTab, CatTargetDriftTab
C:\Users\Prosenjeet Saha\anaconda3\lib\site-packages\evidently\analyzers\__init__.py:3: UserWarning:

analyzers are deprecated, use metrics instead

C:\Users\Prosenjeet Saha\anaconda3\lib\site-packages\evidently\dashboard\__init__.py:8: UserWarning:

dashboards are deprecated, use metrics instead

In [16]:
import pandas as pd
from eurybia.data.data_loader import data_loading

house_df, house_dict = data_loading('house_prices')


def _split_features_target(df, year):
    """Return (X, y) for the rows of `df` sold in `year`.

    `SalePrice` becomes the single-column target frame; both `SalePrice`
    and `YrSold` are dropped from the feature matrix.
    """
    subset = df.loc[df['YrSold'] == year]
    y = subset['SalePrice'].to_frame()
    X = subset[subset.columns.difference(['SalePrice', 'YrSold'])]
    return X, y


# Baseline = 2006 sales (model training data); current = 2007 sales
# (the period we check for drift against the baseline).
house_df_learning = house_df.loc[house_df['YrSold'] == 2006]
house_df_2007 = house_df.loc[house_df['YrSold'] == 2007]

X_df_learning, y_df_learning = _split_features_target(house_df, 2006)
X_df_2007, y_df_2007 = _split_features_target(house_df, 2007)
In [19]:
from category_encoders import OrdinalEncoder
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split  # was imported twice; duplicate removed

# Ordinal-encode every object-dtype (categorical) column so LightGBM can
# consume them; handle_unknown='ignore' keeps categories unseen in 2006
# from raising when the 2007 data is transformed later.
categorical_features = [col for col in X_df_learning.columns
                        if X_df_learning[col].dtype == 'object']

encoder = OrdinalEncoder(
    cols=categorical_features,
    handle_unknown='ignore',
    return_df=True).fit(X_df_learning)

X_df_learning_encoded = encoder.transform(X_df_learning)

# 75/25 train/test split with a fixed seed for reproducibility
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X_df_learning_encoded, y_df_learning, train_size=0.75, random_state=1)
regressor = LGBMRegressor(n_estimators=200).fit(Xtrain, ytrain)
In [20]:
from eurybia import SmartDrift
In [23]:
# Build the SmartDrift object: compares the 2007 features (current)
# against the 2006 training features (baseline). Passing the deployed
# model and its encoder lets eurybia also track model-output drift.
SD = SmartDrift(
    df_current=X_df_2007,
    df_baseline=X_df_learning,
    deployed_model=regressor,
    encoding=encoder,
)
In [24]:
# Compile the drift analysis. full_validation=True runs the per-column
# consistency checks (mismatching unique values are reported below);
# the datadrift-classifier AUC is appended to house_price_auc.csv under
# the given compile date.
SD.compile(
    full_validation=True,
    date_compile_auc='01/01/2007',
    datadrift_file="house_price_auc.csv",
)
The variable BsmtCond has mismatching unique values:
['Poor -Severe cracking, settling, or wetness'] | []

The variable Condition2 has mismatching unique values:
['Near positive off-site feature--park, greenbelt, etc.', 'Adjacent to North-South Railroad', 'Adjacent to East-West Railroad'] | ['Adjacent to feeder street']

The variable Electrical has mismatching unique values:
['Mixed'] | ['60 AMP Fuse Box and mostly knob & tube wiring (poor)']

The variable ExterQual has mismatching unique values:
['Fair'] | []

The variable Exterior1st has mismatching unique values:
[] | ['Stone', 'Imitation Stucco']

The variable Exterior2nd has mismatching unique values:
['Asphalt Shingles', 'Brick Common'] | ['Other']

The variable Foundation has mismatching unique values:
[] | ['Stone', 'Wood']

The variable Functional has mismatching unique values:
['Major Deductions 2', 'Severely Damaged'] | ['Moderate Deductions']

The variable GarageQual has mismatching unique values:
[] | ['Excellent']

The variable Heating has mismatching unique values:
[] | ['Wall furnace']

The variable HeatingQC has mismatching unique values:
['Poor'] | []

The variable LotConfig has mismatching unique values:
[] | ['Frontage on 3 sides of property']

The variable MSSubClass has mismatching unique values:
['1-Story w/Finished Attic All Ages'] | []

The variable Neighborhood has mismatching unique values:
['Northpark Villa'] | []

The variable RoofMatl has mismatching unique values:
['Roll'] | ['Metal']

The variable RoofStyle has mismatching unique values:
['Mansard', 'Shed'] | []

The variable SaleType has mismatching unique values:
['Warranty Deed - Cash'] | ['Contract Low Interest', 'Contract Low Down', 'Contract Low Down payment and low interest']

The variable Street has mismatching unique values:
['Gravel'] | []

The computed AUC on the X_test used to build datadrift_classifier is equal to: 0.626082251082251
house_price_auc.csv did not exist and was created. 
In [27]:
# Render the standalone HTML drift report for the 2007-vs-2006 comparison
SD.generate_report(
    output_file='report_house_price_datadrift_2007.html',
    title_story="Data drift",
    title_description="House price Data drift 2007",
)
WARNING:datapane:Bokeh version 2.4.3 is not supported, these plots may not display correctly, please install version ~=2.2.0

Report saved to ./report_house_price_datadrift_2007.html. To upload and share your report, create a free Datapane account by running !datapane signup.

In [29]:
# NOTE: this cell was pasted from the NannyML docs with '>>>'/'...'
# doctest prompts left in — it only ran because IPython strips prompts.
# Rewritten as plain Python so it survives Restart & Run All anywhere.
import nannyml as nml
from IPython.display import display

# Synthetic car-loan dataset: a fitted reference period and an analysis
# period to scan for drift (targets are unused here).
reference_df, analysis_df, _ = nml.load_synthetic_car_loan_dataset()
display(reference_df.head())

column_names = [
    'car_value', 'salary_range', 'debt_to_income_ratio', 'loan_length',
    'repaid_loan_on_prev_car', 'size_of_downpayment', 'driver_tenure',
    'y_pred_proba', 'y_pred',
]

# Univariate drift: KS + Jensen-Shannon for continuous columns,
# chi2 + Jensen-Shannon for categorical ones (y_pred forced categorical).
calc = nml.UnivariateDriftCalculator(
    column_names=column_names,
    treat_as_categorical=['y_pred'],
    timestamp_column_name='timestamp',
    continuous_methods=['kolmogorov_smirnov', 'jensen_shannon'],
    categorical_methods=['chi2', 'jensen_shannon'],
)

calc.fit(reference_df)
results = calc.calculate(analysis_df)
display(results.filter(period='analysis', column_names=['debt_to_income_ratio']).to_df())

# Drift-score plots over time, then per-chunk distribution plots,
# for continuous and categorical columns respectively.
figure = results.filter(column_names=results.continuous_column_names, methods=['jensen_shannon']).plot(kind='drift')
figure.show()

figure = results.filter(column_names=results.categorical_column_names, methods=['chi2']).plot(kind='drift')
figure.show()

figure = results.filter(column_names=results.continuous_column_names, methods=['jensen_shannon']).plot(kind='distribution')
figure.show()

figure = results.filter(column_names=results.categorical_column_names, methods=['chi2']).plot(kind='distribution')
figure.show()
car_value salary_range debt_to_income_ratio loan_length repaid_loan_on_prev_car size_of_downpayment driver_tenure repaid timestamp y_pred_proba y_pred
0 39811.0 40K - 60K € 0.632950 19.0 False 40% 0.212653 1.0 2018-01-01 00:00:00.000 0.99 1
1 12679.0 40K - 60K € 0.718627 7.0 True 10% 4.927549 0.0 2018-01-01 00:08:43.152 0.07 0
2 19847.0 40K - 60K € 0.721724 17.0 False 0% 0.520817 1.0 2018-01-01 00:17:26.304 1.00 1
3 22652.0 20K - 20K € 0.705992 16.0 False 10% 0.453649 1.0 2018-01-01 00:26:09.456 0.98 1
4 21268.0 60K+ € 0.671888 21.0 True 30% 5.695263 1.0 2018-01-01 00:34:52.608 0.99 1
WARNING:nannyml.drift.univariate.methods:Chi2 statistic lower threshold value -0.5116701548425131 overridden by lower threshold value limit 0
WARNING:nannyml.drift.univariate.methods:Chi2 statistic upper threshold value 5.848115773586565 overridden by upper threshold value limit 1.0
WARNING:nannyml.drift.univariate.methods:Chi2 statistic lower threshold value -2.713471347464844 overridden by lower threshold value limit 0
WARNING:nannyml.drift.univariate.methods:Chi2 statistic upper threshold value 4.16076125080716 overridden by upper threshold value limit 1.0
WARNING:nannyml.drift.univariate.methods:Chi2 statistic lower threshold value -3.4101771406067924 overridden by lower threshold value limit 0
WARNING:nannyml.drift.univariate.methods:Chi2 statistic upper threshold value 8.101802433611365 overridden by upper threshold value limit 1.0
WARNING:nannyml.drift.univariate.methods:Chi2 statistic lower threshold value -0.556011256726464 overridden by lower threshold value limit 0
WARNING:nannyml.drift.univariate.methods:Chi2 statistic upper threshold value 1.3090287653237747 overridden by upper threshold value limit 1.0
chunk debt_to_income_ratio
chunk kolmogorov_smirnov jensen_shannon
key chunk_index start_index end_index start_date end_date period value upper_threshold lower_threshold alert value upper_threshold lower_threshold alert
0 [0:4999] 0 0 4999 2018-10-30 18:00:00 2018-11-30 00:27:16.848 analysis 0.01576 0.018584 None False 0.031661 0.1 None False
1 [5000:9999] 1 5000 9999 2018-11-30 00:36:00 2018-12-30 07:03:16.848 analysis 0.01268 0.018584 None False 0.030011 0.1 None False
2 [10000:14999] 2 10000 14999 2018-12-30 07:12:00 2019-01-29 13:39:16.848 analysis 0.01734 0.018584 None False 0.031129 0.1 None False
3 [15000:19999] 3 15000 19999 2019-01-29 13:48:00 2019-02-28 20:15:16.848 analysis 0.01280 0.018584 None False 0.029464 0.1 None False
4 [20000:24999] 4 20000 24999 2019-02-28 20:24:00 2019-03-31 02:51:16.848 analysis 0.01918 0.018584 None True 0.030809 0.1 None False
5 [25000:29999] 5 25000 29999 2019-03-31 03:00:00 2019-04-30 09:27:16.848 analysis 0.00824 0.018584 None False 0.028681 0.1 None False
6 [30000:34999] 6 30000 34999 2019-04-30 09:36:00 2019-05-30 16:03:16.848 analysis 0.01058 0.018584 None False 0.043628 0.1 None False
7 [35000:39999] 7 35000 39999 2019-05-30 16:12:00 2019-06-29 22:39:16.848 analysis 0.01002 0.018584 None False 0.029253 0.1 None False
8 [40000:44999] 8 40000 44999 2019-06-29 22:48:00 2019-07-30 05:15:16.848 analysis 0.01068 0.018584 None False 0.030628 0.1 None False
9 [45000:49999] 9 45000 49999 2019-07-30 05:24:00 2019-08-29 11:51:16.848 analysis 0.00680 0.018584 None False 0.028330 0.1 None False
In [30]:
# NOTE: doctest '>>>'/'...' prompts from the NannyML docs removed —
# they only ran via IPython's prompt-stripping and are not valid Python.
import nannyml as nml
from IPython.display import display

# Load synthetic data
reference, analysis, _ = nml.load_synthetic_car_loan_dataset()
display(reference.head())

non_feature_columns = ['timestamp', 'y_pred_proba', 'y_pred', 'repaid']

# Define feature columns: everything except metadata/model-output columns
feature_column_names = [
    col for col in reference.columns
    if col not in non_feature_columns
]

# Multivariate drift via PCA reconstruction error over 5000-row chunks:
# rising error means the joint feature distribution has shifted.
calc = nml.DataReconstructionDriftCalculator(
    column_names=feature_column_names,
    timestamp_column_name='timestamp',
    chunk_size=5000,
)
calc.fit(reference)
results = calc.calculate(analysis)

display(results.filter(period='analysis').to_df())

display(results.filter(period='reference').to_df())

figure = results.plot()
figure.show()
car_value salary_range debt_to_income_ratio loan_length repaid_loan_on_prev_car size_of_downpayment driver_tenure repaid timestamp y_pred_proba y_pred
0 39811.0 40K - 60K € 0.632950 19.0 False 40% 0.212653 1.0 2018-01-01 00:00:00.000 0.99 1
1 12679.0 40K - 60K € 0.718627 7.0 True 10% 4.927549 0.0 2018-01-01 00:08:43.152 0.07 0
2 19847.0 40K - 60K € 0.721724 17.0 False 0% 0.520817 1.0 2018-01-01 00:17:26.304 1.00 1
3 22652.0 20K - 20K € 0.705992 16.0 False 10% 0.453649 1.0 2018-01-01 00:26:09.456 0.98 1
4 21268.0 60K+ € 0.671888 21.0 True 30% 5.695263 1.0 2018-01-01 00:34:52.608 0.99 1
chunk reconstruction_error
key chunk_index start_index end_index start_date end_date period sampling_error value upper_confidence_boundary lower_confidence_boundary upper_threshold lower_threshold alert
0 [0:4999] 0 0 4999 2018-10-30 18:00:00 2018-11-30 00:27:16.848 analysis 0.006996 1.141517 1.162505 1.120528 1.153019 1.115281 False
1 [5000:9999] 1 5000 9999 2018-11-30 00:36:00 2018-12-30 07:03:16.848 analysis 0.006996 1.130636 1.151625 1.109648 1.153019 1.115281 False
2 [10000:14999] 2 10000 14999 2018-12-30 07:12:00 2019-01-29 13:39:16.848 analysis 0.006996 1.138914 1.159903 1.117926 1.153019 1.115281 False
3 [15000:19999] 3 15000 19999 2019-01-29 13:48:00 2019-02-28 20:15:16.848 analysis 0.006996 1.145040 1.166029 1.124052 1.153019 1.115281 False
4 [20000:24999] 4 20000 24999 2019-02-28 20:24:00 2019-03-31 02:51:16.848 analysis 0.006996 1.137563 1.158551 1.116574 1.153019 1.115281 False
5 [25000:29999] 5 25000 29999 2019-03-31 03:00:00 2019-04-30 09:27:16.848 analysis 0.006996 1.249205 1.270194 1.228217 1.153019 1.115281 True
6 [30000:34999] 6 30000 34999 2019-04-30 09:36:00 2019-05-30 16:03:16.848 analysis 0.006996 1.243101 1.264090 1.222113 1.153019 1.115281 True
7 [35000:39999] 7 35000 39999 2019-05-30 16:12:00 2019-06-29 22:39:16.848 analysis 0.006996 1.258147 1.279135 1.237158 1.153019 1.115281 True
8 [40000:44999] 8 40000 44999 2019-06-29 22:48:00 2019-07-30 05:15:16.848 analysis 0.006996 1.228179 1.249167 1.207190 1.153019 1.115281 True
9 [45000:49999] 9 45000 49999 2019-07-30 05:24:00 2019-08-29 11:51:16.848 analysis 0.006996 1.259884 1.280872 1.238895 1.153019 1.115281 True
chunk reconstruction_error
key chunk_index start_index end_index start_date end_date period sampling_error value upper_confidence_boundary lower_confidence_boundary upper_threshold lower_threshold alert
0 [0:4999] 0 0 4999 2018-01-01 00:00:00 2018-01-31 06:27:16.848 reference 0.006996 1.136408 1.157397 1.115420 1.153019 1.115281 False
1 [5000:9999] 1 5000 9999 2018-01-31 06:36:00 2018-03-02 13:03:16.848 reference 0.006996 1.134478 1.155466 1.113490 1.153019 1.115281 False
2 [10000:14999] 2 10000 14999 2018-03-02 13:12:00 2018-04-01 19:39:16.848 reference 0.006996 1.136076 1.157065 1.115088 1.153019 1.115281 False
3 [15000:19999] 3 15000 19999 2018-04-01 19:48:00 2018-05-02 02:15:16.848 reference 0.006996 1.140114 1.161103 1.119126 1.153019 1.115281 False
4 [20000:24999] 4 20000 24999 2018-05-02 02:24:00 2018-06-01 08:51:16.848 reference 0.006996 1.126078 1.147067 1.105090 1.153019 1.115281 False
5 [25000:29999] 5 25000 29999 2018-06-01 09:00:00 2018-07-01 15:27:16.848 reference 0.006996 1.142018 1.163006 1.121029 1.153019 1.115281 False
6 [30000:34999] 6 30000 34999 2018-07-01 15:36:00 2018-07-31 22:03:16.848 reference 0.006996 1.125146 1.146135 1.104158 1.153019 1.115281 False
7 [35000:39999] 7 35000 39999 2018-07-31 22:12:00 2018-08-31 04:39:16.848 reference 0.006996 1.143206 1.164195 1.122218 1.153019 1.115281 False
8 [40000:44999] 8 40000 44999 2018-08-31 04:48:00 2018-09-30 11:15:16.848 reference 0.006996 1.130954 1.151942 1.109965 1.153019 1.115281 False
9 [45000:49999] 9 45000 49999 2018-09-30 11:24:00 2018-10-30 17:51:16.848 reference 0.006996 1.127026 1.148014 1.106037 1.153019 1.115281 False
In [31]:
# NOTE: doctest '>>>'/'...' prompts from the NannyML docs removed —
# they only ran via IPython's prompt-stripping and are not valid Python.
import nannyml as nml
from IPython.display import display

# Analysis targets are shipped separately; merge them back on the index
# so realized performance can be computed alongside estimated performance.
reference_df, analysis_df, analysis_target_df = nml.load_synthetic_car_loan_dataset()
analysis_full_df = analysis_df.merge(analysis_target_df, left_index=True, right_index=True)

column_names = [
    'car_value', 'salary_range', 'debt_to_income_ratio', 'loan_length',
    'repaid_loan_on_prev_car', 'size_of_downpayment', 'driver_tenure',
    'y_pred_proba', 'y_pred', 'repaid',
]

univ_calc = nml.UnivariateDriftCalculator(
    column_names=column_names,
    treat_as_categorical=['y_pred', 'repaid'],
    timestamp_column_name='timestamp',
    continuous_methods=['kolmogorov_smirnov', 'jensen_shannon'],
    categorical_methods=['chi2', 'jensen_shannon'],
    chunk_size=5000,
)

univ_calc.fit(reference_df)
univariate_results = univ_calc.calculate(analysis_full_df)
display(univariate_results.filter(period='analysis', column_names=['debt_to_income_ratio']).to_df())

# Rank columns by how many drift alerts each raised
alert_count_ranker = nml.AlertCountRanker()
alert_count_ranked_features = alert_count_ranker.rank(
    univariate_results.filter(methods=['jensen_shannon']),
    only_drifting=False)
display(alert_count_ranked_features)

# CBPE: estimate performance without targets (confidence-based estimation)
estimated_calc = nml.CBPE(
    y_pred_proba='y_pred_proba',
    y_pred='y_pred',
    y_true='repaid',
    timestamp_column_name='timestamp',
    metrics=['roc_auc', 'recall'],
    chunk_size=5000,
    problem_type='classification_binary',
)
estimated_calc.fit(reference_df)
estimated_perf_results = estimated_calc.estimate(analysis_full_df)
display(estimated_perf_results.filter(period='analysis').to_df())

# Realized performance from the merged ground-truth targets
realized_calc = nml.PerformanceCalculator(
    y_pred_proba='y_pred_proba',
    y_pred='y_pred',
    y_true='repaid',
    timestamp_column_name='timestamp',
    problem_type='classification_binary',
    metrics=['roc_auc', 'recall'],
    chunk_size=5000)
realized_calc.fit(reference_df)
realized_perf_results = realized_calc.calculate(analysis_full_df)
display(realized_perf_results.filter(period='analysis').to_df())

ranker1 = nml.CorrelationRanker()

# ranker fits on one metric and reference period data only
ranker1.fit(
    estimated_perf_results.filter(period='reference', metrics=['roc_auc']))
# ranker ranks on one drift method and one performance metric
correlation_ranked_features1 = ranker1.rank(
    univariate_results.filter(methods=['jensen_shannon']),
    estimated_perf_results.filter(metrics=['roc_auc']),
    only_drifting=False)

display(correlation_ranked_features1)

ranker2 = nml.CorrelationRanker()

# ranker fits on one metric and reference period data only
ranker2.fit(
    realized_perf_results.filter(period='reference', metrics=['recall']))
# ranker ranks on one drift method and one performance metric
correlation_ranked_features2 = ranker2.rank(
    univariate_results.filter(period='analysis', methods=['jensen_shannon']),
    realized_perf_results.filter(period='analysis', metrics=['recall']),
    only_drifting=False)

display(correlation_ranked_features2)
WARNING:nannyml.drift.univariate.methods:Chi2 statistic lower threshold value -0.5116701548425131 overridden by lower threshold value limit 0
WARNING:nannyml.drift.univariate.methods:Chi2 statistic upper threshold value 5.848115773586565 overridden by upper threshold value limit 1.0
WARNING:nannyml.drift.univariate.methods:Chi2 statistic lower threshold value -2.713471347464844 overridden by lower threshold value limit 0
WARNING:nannyml.drift.univariate.methods:Chi2 statistic upper threshold value 4.16076125080716 overridden by upper threshold value limit 1.0
WARNING:nannyml.drift.univariate.methods:Chi2 statistic lower threshold value -3.4101771406067924 overridden by lower threshold value limit 0
WARNING:nannyml.drift.univariate.methods:Chi2 statistic upper threshold value 8.101802433611365 overridden by upper threshold value limit 1.0
WARNING:nannyml.drift.univariate.methods:Chi2 statistic lower threshold value -0.556011256726464 overridden by lower threshold value limit 0
WARNING:nannyml.drift.univariate.methods:Chi2 statistic upper threshold value 1.3090287653237747 overridden by upper threshold value limit 1.0
WARNING:nannyml.drift.univariate.methods:Chi2 statistic lower threshold value -0.7927773066243359 overridden by lower threshold value limit 0
WARNING:nannyml.drift.univariate.methods:Chi2 statistic upper threshold value 1.7259320577135675 overridden by upper threshold value limit 1.0
chunk debt_to_income_ratio
chunk kolmogorov_smirnov jensen_shannon
key chunk_index start_index end_index start_date end_date period value upper_threshold lower_threshold alert value upper_threshold lower_threshold alert
0 [0:4999] 0 0 4999 2018-10-30 18:00:00 2018-11-30 00:27:16.848 analysis 0.01576 0.018584 None False 0.031661 0.1 None False
1 [5000:9999] 1 5000 9999 2018-11-30 00:36:00 2018-12-30 07:03:16.848 analysis 0.01268 0.018584 None False 0.030011 0.1 None False
2 [10000:14999] 2 10000 14999 2018-12-30 07:12:00 2019-01-29 13:39:16.848 analysis 0.01734 0.018584 None False 0.031129 0.1 None False
3 [15000:19999] 3 15000 19999 2019-01-29 13:48:00 2019-02-28 20:15:16.848 analysis 0.01280 0.018584 None False 0.029464 0.1 None False
4 [20000:24999] 4 20000 24999 2019-02-28 20:24:00 2019-03-31 02:51:16.848 analysis 0.01918 0.018584 None True 0.030809 0.1 None False
5 [25000:29999] 5 25000 29999 2019-03-31 03:00:00 2019-04-30 09:27:16.848 analysis 0.00824 0.018584 None False 0.028681 0.1 None False
6 [30000:34999] 6 30000 34999 2019-04-30 09:36:00 2019-05-30 16:03:16.848 analysis 0.01058 0.018584 None False 0.043628 0.1 None False
7 [35000:39999] 7 35000 39999 2019-05-30 16:12:00 2019-06-29 22:39:16.848 analysis 0.01002 0.018584 None False 0.029253 0.1 None False
8 [40000:44999] 8 40000 44999 2019-06-29 22:48:00 2019-07-30 05:15:16.848 analysis 0.01068 0.018584 None False 0.030628 0.1 None False
9 [45000:49999] 9 45000 49999 2019-07-30 05:24:00 2019-08-29 11:51:16.848 analysis 0.00680 0.018584 None False 0.028330 0.1 None False
number_of_alerts column_name rank
0 5 y_pred_proba 1
1 5 salary_range 2
2 5 repaid_loan_on_prev_car 3
3 5 loan_length 4
4 5 car_value 5
5 0 y_pred 6
6 0 size_of_downpayment 7
7 0 repaid 8
8 0 driver_tenure 9
9 0 debt_to_income_ratio 10
chunk roc_auc recall
key chunk_index start_index end_index start_date end_date period value sampling_error realized ... lower_threshold alert value sampling_error realized upper_confidence_boundary lower_confidence_boundary upper_threshold lower_threshold alert
0 [0:4999] 0 0 4999 2018-10-30 18:00:00 2018-11-30 00:27:16.848 analysis 0.968631 0.001811 0.970962 ... 0.963317 False 0.928723 0.005137 0.930394 0.944133 0.913313 0.941033 0.9171 False
1 [5000:9999] 1 5000 9999 2018-11-30 00:36:00 2018-12-30 07:03:16.848 analysis 0.969044 0.001811 0.970248 ... 0.963317 False 0.925261 0.005137 0.923922 0.940671 0.909851 0.941033 0.9171 False
2 [10000:14999] 2 10000 14999 2018-12-30 07:12:00 2019-01-29 13:39:16.848 analysis 0.969444 0.001811 0.976282 ... 0.963317 False 0.929317 0.005137 0.938246 0.944727 0.913907 0.941033 0.9171 False
3 [15000:19999] 3 15000 19999 2019-01-29 13:48:00 2019-02-28 20:15:16.848 analysis 0.969047 0.001811 0.967721 ... 0.963317 False 0.929713 0.005137 0.925060 0.945123 0.914303 0.941033 0.9171 False
4 [20000:24999] 4 20000 24999 2019-02-28 20:24:00 2019-03-31 02:51:16.848 analysis 0.968873 0.001811 0.969886 ... 0.963317 False 0.930604 0.005137 0.927577 0.946014 0.915194 0.941033 0.9171 False
5 [25000:29999] 5 25000 29999 2019-03-31 03:00:00 2019-04-30 09:27:16.848 analysis 0.960478 0.001811 0.960050 ... 0.963317 True 0.883990 0.005137 0.905086 0.899400 0.868580 0.941033 0.9171 True
6 [30000:34999] 6 30000 34999 2019-04-30 09:36:00 2019-05-30 16:03:16.848 analysis 0.961134 0.001811 0.958530 ... 0.963317 True 0.883528 0.005137 0.899010 0.898938 0.868118 0.941033 0.9171 True
7 [35000:39999] 7 35000 39999 2019-05-30 16:12:00 2019-06-29 22:39:16.848 analysis 0.960536 0.001811 0.959041 ... 0.963317 True 0.885501 0.005137 0.901718 0.900911 0.870091 0.941033 0.9171 True
8 [40000:44999] 8 40000 44999 2019-06-29 22:48:00 2019-07-30 05:15:16.848 analysis 0.961869 0.001811 0.963094 ... 0.963317 True 0.885978 0.005137 0.906124 0.901388 0.870568 0.941033 0.9171 True
9 [45000:49999] 9 45000 49999 2019-07-30 05:24:00 2019-08-29 11:51:16.848 analysis 0.960537 0.001811 0.957556 ... 0.963317 True 0.889808 0.005137 0.905823 0.905218 0.874398 0.941033 0.9171 True

10 rows × 23 columns

chunk roc_auc recall
key chunk_index start_index end_index start_date end_date period targets_missing_rate sampling_error value upper_threshold lower_threshold alert sampling_error value upper_threshold lower_threshold alert
0 [0:4999] 0 0 4999 2018-10-30 18:00:00 2018-11-30 00:27:16.848 analysis 0.0 0.001811 0.970962 0.97866 0.963317 False 0.005137 0.930394 0.941033 0.9171 False
1 [5000:9999] 1 5000 9999 2018-11-30 00:36:00 2018-12-30 07:03:16.848 analysis 0.0 0.001811 0.970248 0.97866 0.963317 False 0.005137 0.923922 0.941033 0.9171 False
2 [10000:14999] 2 10000 14999 2018-12-30 07:12:00 2019-01-29 13:39:16.848 analysis 0.0 0.001811 0.976282 0.97866 0.963317 False 0.005137 0.938246 0.941033 0.9171 False
3 [15000:19999] 3 15000 19999 2019-01-29 13:48:00 2019-02-28 20:15:16.848 analysis 0.0 0.001811 0.967721 0.97866 0.963317 False 0.005137 0.925060 0.941033 0.9171 False
4 [20000:24999] 4 20000 24999 2019-02-28 20:24:00 2019-03-31 02:51:16.848 analysis 0.0 0.001811 0.969886 0.97866 0.963317 False 0.005137 0.927577 0.941033 0.9171 False
5 [25000:29999] 5 25000 29999 2019-03-31 03:00:00 2019-04-30 09:27:16.848 analysis 0.0 0.001811 0.960050 0.97866 0.963317 True 0.005137 0.905086 0.941033 0.9171 True
6 [30000:34999] 6 30000 34999 2019-04-30 09:36:00 2019-05-30 16:03:16.848 analysis 0.0 0.001811 0.958530 0.97866 0.963317 True 0.005137 0.899010 0.941033 0.9171 True
7 [35000:39999] 7 35000 39999 2019-05-30 16:12:00 2019-06-29 22:39:16.848 analysis 0.0 0.001811 0.959041 0.97866 0.963317 True 0.005137 0.901718 0.941033 0.9171 True
8 [40000:44999] 8 40000 44999 2019-06-29 22:48:00 2019-07-30 05:15:16.848 analysis 0.0 0.001811 0.963094 0.97866 0.963317 True 0.005137 0.906124 0.941033 0.9171 True
9 [45000:49999] 9 45000 49999 2019-07-30 05:24:00 2019-08-29 11:51:16.848 analysis 0.0 0.001811 0.957556 0.97866 0.963317 True 0.005137 0.905823 0.941033 0.9171 True
column_name pearsonr_correlation pearsonr_pvalue has_drifted rank
0 repaid_loan_on_prev_car 0.998290 1.177706e-23 True 1
1 y_pred_proba 0.998072 3.474576e-23 True 2
2 loan_length 0.996876 2.661458e-21 True 3
3 salary_range 0.996512 7.162919e-21 True 4
4 car_value 0.996148 1.746760e-20 True 5
5 size_of_downpayment 0.307497 1.872200e-01 False 6
6 debt_to_income_ratio 0.250211 2.873424e-01 False 7
7 y_pred 0.075282 7.524257e-01 False 8
8 repaid -0.117004 6.232446e-01 False 9
9 driver_tenure -0.134447 5.719876e-01 False 10
column_name pearsonr_correlation pearsonr_pvalue has_drifted rank
0 repaid_loan_on_prev_car 0.968970 0.000004 True 1
1 y_pred_proba 0.966157 0.000006 True 2
2 loan_length 0.965298 0.000006 True 3
3 car_value 0.963623 0.000007 True 4
4 salary_range 0.963456 0.000007 True 5
5 size_of_downpayment 0.308948 0.385072 False 6
6 debt_to_income_ratio 0.307373 0.387627 False 7
7 y_pred -0.357571 0.310383 False 8
8 repaid -0.395842 0.257495 False 9
9 driver_tenure -0.575807 0.081520 False 10
In [32]:
#Adjusting Plots
>>> import nannyml as nml
>>> reference, analysis, analysis_target = nml.load_synthetic_car_loan_dataset()

>>> estimator = nml.CBPE(
...     y_pred_proba='y_pred_proba',
...     y_pred='y_pred',
...     y_true='repaid',
...     timestamp_column_name='timestamp',
...     metrics=['roc_auc'],
...     chunk_size=5000,
...     problem_type='classification_binary',
... ).fit(reference)

>>> estimated_performance = estimator.estimate(analysis)
>>> figure = estimated_performance.plot(kind='performance')

>>> # indicate period of interest
>>> import datetime as dt

>>> # add additional indicator for a particular period
>>> figure.add_vrect(
...     x0=dt.datetime(2019,2,28),
...     x1=dt.datetime(2019,4,30),
...     annotation_text="Strategy change",
...     annotation_position="top left"
... )

>>> figure.show()
In [33]:
#Thresholds
>>> import numpy as np

>>> import nannyml as nml
>>> from IPython.display import display

>>> reference_df, analysis_df, _ = nml.load_synthetic_car_loan_dataset()
>>> display(reference_df.head())

>>> estimator = nml.CBPE(
...     y_pred_proba='y_pred_proba',
...     y_pred='y_pred',
...     y_true='repaid',
...     timestamp_column_name='timestamp',
...     metrics=['f1'],
...     chunk_size=5000,
...     problem_type='classification_binary',
... )
>>> estimator.thresholds['f1']

>>> estimator.fit(reference_df)
>>> results = estimator.estimate(analysis_df)
>>> columns = [('chunk', 'key'), ('chunk', 'period'), ('f1', 'value'), ('f1', 'upper_threshold'), ('f1', 'lower_threshold'), ('f1', 'alert')]
>>> display(results.to_df()[columns])

>>> metric_fig = results.plot()
>>> metric_fig.show()

>>> constant_threshold = nml.thresholds.ConstantThreshold(lower=None, upper=0.93)
>>> constant_threshold.thresholds(results.filter(period='reference').to_df()[('f1', 'value')])

>>> estimator = nml.CBPE(
...     y_pred_proba='y_pred_proba',
...     y_pred='y_pred',
...     y_true='repaid',
...     timestamp_column_name='timestamp',
...     metrics=['f1'],
...     chunk_size=5000,
...     problem_type='classification_binary',
...     thresholds={
...         'f1': constant_threshold
...     }
... )
>>> estimator.fit(reference_df)
>>> results = estimator.estimate(analysis_df)
>>> display(results.to_df()[columns])

>>> metric_fig = results.plot()
>>> metric_fig.show()
car_value salary_range debt_to_income_ratio loan_length repaid_loan_on_prev_car size_of_downpayment driver_tenure repaid timestamp y_pred_proba y_pred
0 39811.0 40K - 60K € 0.632950 19.0 False 40% 0.212653 1.0 2018-01-01 00:00:00.000 0.99 1
1 12679.0 40K - 60K € 0.718627 7.0 True 10% 4.927549 0.0 2018-01-01 00:08:43.152 0.07 0
2 19847.0 40K - 60K € 0.721724 17.0 False 0% 0.520817 1.0 2018-01-01 00:17:26.304 1.00 1
3 22652.0 20K - 20K € 0.705992 16.0 False 10% 0.453649 1.0 2018-01-01 00:26:09.456 0.98 1
4 21268.0 60K+ € 0.671888 21.0 True 30% 5.695263 1.0 2018-01-01 00:34:52.608 0.99 1
chunk f1
key period value upper_threshold lower_threshold alert
0 [0:4999] reference 0.942960 0.95085 0.93466 False
1 [5000:9999] reference 0.940827 0.95085 0.93466 False
2 [10000:14999] reference 0.943211 0.95085 0.93466 False
3 [15000:19999] reference 0.942901 0.95085 0.93466 False
4 [20000:24999] reference 0.943178 0.95085 0.93466 False
5 [25000:29999] reference 0.942702 0.95085 0.93466 False
6 [30000:34999] reference 0.940858 0.95085 0.93466 False
7 [35000:39999] reference 0.944588 0.95085 0.93466 False
8 [40000:44999] reference 0.944518 0.95085 0.93466 False
9 [45000:49999] reference 0.944430 0.95085 0.93466 False
10 [0:4999] analysis 0.943030 0.95085 0.93466 False
11 [5000:9999] analysis 0.941324 0.95085 0.93466 False
12 [10000:14999] analysis 0.943574 0.95085 0.93466 False
13 [15000:19999] analysis 0.943159 0.95085 0.93466 False
14 [20000:24999] analysis 0.944204 0.95085 0.93466 False
15 [25000:29999] analysis 0.911753 0.95085 0.93466 True
16 [30000:34999] analysis 0.911766 0.95085 0.93466 True
17 [35000:39999] analysis 0.911661 0.95085 0.93466 True
18 [40000:44999] analysis 0.913763 0.95085 0.93466 True
19 [45000:49999] analysis 0.914751 0.95085 0.93466 True
chunk f1
key period value upper_threshold lower_threshold alert
0 [0:4999] reference 0.942960 0.93 None True
1 [5000:9999] reference 0.940827 0.93 None True
2 [10000:14999] reference 0.943211 0.93 None True
3 [15000:19999] reference 0.942901 0.93 None True
4 [20000:24999] reference 0.943178 0.93 None True
5 [25000:29999] reference 0.942702 0.93 None True
6 [30000:34999] reference 0.940858 0.93 None True
7 [35000:39999] reference 0.944588 0.93 None True
8 [40000:44999] reference 0.944518 0.93 None True
9 [45000:49999] reference 0.944430 0.93 None True
10 [0:4999] analysis 0.943030 0.93 None True
11 [5000:9999] analysis 0.941324 0.93 None True
12 [10000:14999] analysis 0.943574 0.93 None True
13 [15000:19999] analysis 0.943159 0.93 None True
14 [20000:24999] analysis 0.944204 0.93 None True
15 [25000:29999] analysis 0.911753 0.93 None False
16 [30000:34999] analysis 0.911766 0.93 None False
17 [35000:39999] analysis 0.911661 0.93 None False
18 [40000:44999] analysis 0.913763 0.93 None False
19 [45000:49999] analysis 0.914751 0.93 None False
In [35]:
import json
import pandas as pd
import numpy as np
import requests
import zipfile
import io

import plotly.offline as py #working offline
import plotly.graph_objs as go

from evidently.pipeline.column_mapping import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset

import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient
In [38]:
# Silence library warnings for a cleaner notebook render and switch
# plotly to offline (in-notebook) mode.
# NOTE(review): this hides ALL warnings, including potentially useful
# deprecation notices — consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
py.init_notebook_mode()
In [ ]:
# Load Data
In [37]:
# Download the UCI Bike Sharing dataset (network access required) and
# read the daily file, parsing the date column and using it as the index.
content = requests.get("https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip").content
with zipfile.ZipFile(io.BytesIO(content)) as arc:
    raw_data = pd.read_csv(arc.open("day.csv"), header=0, sep=',', parse_dates=['dteday'], index_col='dteday')
In [39]:
#observe data structure
# First five daily records; note that `dteday` is the index, not a column.
raw_data.head()
Out[39]:
instant season yr mnth holiday weekday workingday weathersit temp atemp hum windspeed casual registered cnt
dteday
2011-01-01 1 1 0 1 0 6 0 2 0.344167 0.363625 0.805833 0.160446 331 654 985
2011-01-02 2 1 0 1 0 0 0 2 0.363478 0.353739 0.696087 0.248539 131 670 801
2011-01-03 3 1 0 1 0 1 1 1 0.196364 0.189405 0.437273 0.248309 120 1229 1349
2011-01-04 4 1 0 1 0 2 1 1 0.200000 0.212122 0.590435 0.160296 108 1454 1562
2011-01-05 5 1 0 1 0 3 1 1 0.226957 0.229270 0.436957 0.186900 82 1518 1600
In [40]:
#set column mapping for Evidently Profile
# Only these five numerical features are monitored for drift; every
# other column of raw_data is ignored by the drift report.
data_columns = ColumnMapping()
data_columns.numerical_features = ['weathersit', 'temp', 'atemp', 'hum', 'windspeed']
In [41]:
# Fixed reference window: the first four weeks of the data (Jan 2011).
reference_dates = ('2011-01-01 00:00:00','2011-01-28 23:00:00')

#set experiment batches dates
# Six month-long analysis batches (Feb–Jul 2011), each compared against
# the January reference window above.
experiment_batches = [
    ('2011-02-01 00:00:00','2011-02-28 23:00:00'),
    ('2011-03-01 00:00:00','2011-03-31 23:00:00'),
    ('2011-04-01 00:00:00','2011-04-30 23:00:00'),
    ('2011-05-01 00:00:00','2011-05-31 23:00:00'),
    ('2011-06-01 00:00:00','2011-06-30 23:00:00'),
    ('2011-07-01 00:00:00','2011-07-31 23:00:00'),
]
In [42]:
# Functions to calculate drift with Evidently
# Smoke-test the drift preset on the first two 100-row slices of the
# data; `report` (a plain dict) is inspected in the next cell.
data_drift_report = Report(metrics=[DataDriftPreset()])
data_drift_report.run(reference_data=raw_data[:100], current_data=raw_data[100:200], column_mapping=data_columns)
report = data_drift_report.as_dict()
In [43]:
# Per-column drift details: metrics[1] is the DataDriftTable entry of the preset.
report["metrics"][1]["result"]["drift_by_columns"]
Out[43]:
{'atemp': {'column_name': 'atemp',
  'column_type': 'num',
  'stattest_name': 'K-S p_value',
  'stattest_threshold': 0.05,
  'drift_score': 1.2170049699100436e-45,
  'drift_detected': True,
  'current': {'small_distribution': {'x': [0.321954,
     0.3655165,
     0.409079,
     0.45264150000000003,
     0.496204,
     0.5397665,
     0.583329,
     0.6268914999999999,
     0.670454,
     0.7140165,
     0.757579],
    'y': [0.22955523672883785,
     0.22955523672883785,
     1.6068866571018652,
     1.6068866571018672,
     3.4433285509325633,
     2.2955523672883813,
     3.9024390243902487,
     5.050215208034427,
     2.9842180774748885,
     1.6068866571018672]}},
  'reference': {'small_distribution': {'x': [0.0790696,
     0.12545554,
     0.17184148,
     0.21822742,
     0.26461336,
     0.31099930000000003,
     0.35738524,
     0.40377118,
     0.45015712,
     0.49654306000000004,
     0.542929],
    'y': [0.8623302664557407,
     1.5090779662975466,
     3.0181559325950915,
     5.389564165348378,
     3.0181559325950915,
     3.018155932595095,
     2.586990799367221,
     1.2934953996836105,
     0.2155825666139351,
     0.6467476998418061]}}},
 'hum': {'column_name': 'hum',
  'column_type': 'num',
  'stattest_name': 'K-S p_value',
  'stattest_threshold': 0.05,
  'drift_score': 0.00012016164291946694,
  'drift_detected': True,
  'current': {'small_distribution': {'x': [0.305,
     0.36674999999999996,
     0.4285,
     0.49024999999999996,
     0.5519999999999999,
     0.61375,
     0.6755,
     0.73725,
     0.7989999999999999,
     0.8607499999999999,
     0.9225],
    'y': [0.32388663967611353,
     0.3238866396761132,
     1.6194331983805677,
     1.2955465587044541,
     2.2672064777327905,
     3.4008097165991917,
     2.9149797570850216,
     1.943319838056681,
     1.1336032388663972,
     0.9716599190283388]}},
  'reference': {'small_distribution': {'x': [0.0,
     0.0948261,
     0.1896522,
     0.28447829999999996,
     0.3793044,
     0.4741305,
     0.5689565999999999,
     0.6637827,
     0.7586088,
     0.8534349,
     0.948261],
    'y': [0.1054561982407797,
     0.1054561982407797,
     0.0,
     0.632737189444678,
     2.003667766574814,
     2.741861154260275,
     1.8982115683340341,
     1.1600181806485765,
     1.0545619824077967,
     0.8436495859262374]}}},
 'temp': {'column_name': 'temp',
  'column_type': 'num',
  'stattest_name': 'K-S p_value',
  'stattest_threshold': 0.05,
  'drift_score': 1.2170049699100436e-45,
  'drift_detected': True,
  'current': {'small_distribution': {'x': [0.336667,
     0.3838336,
     0.4310002,
     0.4781668,
     0.5253334,
     0.5725,
     0.6196666,
     0.6668331999999999,
     0.7139998,
     0.7611664,
     0.808333],
    'y': [0.2120144339426628,
     0.6360433018279884,
     1.6961154715413025,
     1.6961154715413045,
     2.332158773369288,
     2.1201443394266306,
     3.1802165091399455,
     3.392230943082601,
     4.452303112795914,
     1.4841010375986414]}},
  'reference': {'small_distribution': {'x': [0.0591304,
     0.11055066,
     0.16197092,
     0.21339117999999999,
     0.26481144,
     0.3162317,
     0.36765196,
     0.41907222,
     0.47049248,
     0.52191274,
     0.573333],
    'y': [0.5834276217195324,
     0.972379369532554,
     3.889517478130216,
     3.500565730317193,
     3.500565730317194,
     3.111613982504173,
     2.139234612971619,
     0.972379369532554,
     0.3889517478130216,
     0.3889517478130216]}}},
 'weathersit': {'column_name': 'weathersit',
  'column_type': 'num',
  'stattest_name': 'chi-square p_value',
  'stattest_threshold': 0.05,
  'drift_score': 0.13673739978240018,
  'drift_detected': False,
  'current': {'small_distribution': {'x': [1.0,
     1.2,
     1.4,
     1.6,
     1.8,
     2.0,
     2.2,
     2.4000000000000004,
     2.6,
     2.8,
     3.0],
    'y': [3.3000000000000007,
     0.0,
     0.0,
     0.0,
     0.0,
     1.6499999999999986,
     0.0,
     0.0,
     0.0,
     0.049999999999999954]}},
  'reference': {'small_distribution': {'x': [1.0,
     1.2,
     1.4,
     1.6,
     1.8,
     2.0,
     2.2,
     2.4000000000000004,
     2.6,
     2.8,
     3.0],
    'y': [2.8500000000000005,
     0.0,
     0.0,
     0.0,
     0.0,
     1.9999999999999982,
     0.0,
     0.0,
     0.0,
     0.14999999999999988]}}},
 'windspeed': {'column_name': 'windspeed',
  'column_type': 'num',
  'stattest_name': 'K-S p_value',
  'stattest_threshold': 0.05,
  'drift_score': 0.03638428787491733,
  'drift_detected': True,
  'current': {'small_distribution': {'x': [0.0746375,
     0.10144045,
     0.1282434,
     0.15504635,
     0.1818493,
     0.20865225,
     0.2354552,
     0.26225815,
     0.2890611,
     0.31586405,
     0.342667],
    'y': [1.1192797807704,
     6.3425854243656,
     5.596398903852003,
     4.477119123081603,
     5.596398903851997,
     4.850212383338403,
     3.3578393423111983,
     1.492373041027201,
     1.865466301284001,
     2.6116528217976014]}},
  'reference': {'small_distribution': {'x': [0.0454083,
     0.09161377,
     0.13781924,
     0.18402470999999998,
     0.23023018,
     0.27643565,
     0.32264112,
     0.36884659000000003,
     0.41505206,
     0.46125752999999997,
     0.507463],
    'y': [0.8656983686130668,
     2.5970951058391996,
     3.029944290145736,
     6.709162356751263,
     4.1120672509120695,
     1.9478213293793993,
     1.514972145072866,
     0.4328491843065337,
     0.21642459215326684,
     0.21642459215326657]}}}}
In [44]:
#evaluate data drift with Evidently Profile
def detect_dataset_drift(reference, production, column_mapping, get_ratio=False):
    """
    Returns True if Data Drift is detected, else returns False.
    If get_ratio is True, returns the observed share of drifted features.
    The Data Drift detection depends on the confidence level and the threshold.
    For each individual feature Data Drift is detected with the selected confidence (default value is 0.95).
    Data Drift for the dataset is detected if share of the drifted features is above the selected threshold (default value is 0.5).
    """
    data_drift_report = Report(metrics=[DataDriftPreset()])
    data_drift_report.run(reference_data=reference, current_data=production, column_mapping=column_mapping)
    # metrics[0] is the DatasetDriftMetric entry of the preset
    result = data_drift_report.as_dict()["metrics"][0]["result"]

    if get_ratio:
        # BUG FIX: "drift_share" is the *configured threshold* (a constant,
        # 0.5 by default), not the measured ratio.  The observed fraction of
        # drifted columns is reported as "share_of_drifted_columns".
        return result["share_of_drifted_columns"]
    return result["dataset_drift"]
In [45]:
#evaluate data drift with Evidently Profile
def detect_features_drift(reference, production, column_mapping, get_scores=False):
    """
    Per-feature drift from an Evidently data-drift report.

    Returns a list of (feature, flag) tuples where flag is the boolean
    drift verdict; when get_scores is True, the raw drift score (e.g. a
    p-value) is returned instead. Features come from the numerical and
    categorical lists of the supplied column mapping.
    For each individual feature Data Drift is detected with the selected
    confidence (default value is 0.95).
    """
    drift_report = Report(metrics=[DataDriftPreset()])
    drift_report.run(reference_data=reference, current_data=production, column_mapping=column_mapping)
    # metrics[1] is the DataDriftTable entry of the preset
    per_column = drift_report.as_dict()["metrics"][1]["result"]["drift_by_columns"]

    features = list(column_mapping.numerical_features or []) + \
               list(column_mapping.categorical_features or [])
    field = "drift_score" if get_scores else "drift_detected"
    return [(feature, per_column[feature][field]) for feature in features]
In [46]:
#Features Drift
# Drift flag per monitored feature for every monthly batch, each batch
# compared against the fixed January reference window.
reference_slice = raw_data.loc[reference_dates[0]:reference_dates[1]]

features_historical_drift = [
    [flag for _, flag in detect_features_drift(reference_slice,
                                               raw_data.loc[start:end],
                                               column_mapping=data_columns)]
    for start, end in experiment_batches
]

features_historical_drift_frame = pd.DataFrame(
    features_historical_drift,
    columns=data_columns.numerical_features,
)
In [47]:
# Binary drift heatmap: one row per feature, one column per batch
# (x labels are the batch *end* dates). Booleans are cast to 0/1 and
# rendered on a two-colour scale (red = drift, blue = no drift).
fig = go.Figure(data=go.Heatmap(
                   z = features_historical_drift_frame.astype(int).transpose(),
                   x = [x[1] for x in experiment_batches],
                   y = data_columns.numerical_features,
                   hoverongaps = False,
                   xgap = 1,
                   ygap = 1,
                   zmin = 0,
                   zmax = 1,
                   showscale = False,
                   colorscale = 'Bluered'
))

fig.update_xaxes(side="top")

fig.update_layout(
    xaxis_title = "Timestamp",
    yaxis_title = "Feature Drift"
)

fig.show()
In [48]:
# Same batch loop as above, but collect the raw drift scores
# (p-values) instead of boolean flags.
reference_window = raw_data.loc[reference_dates[0]:reference_dates[1]]

features_historical_drift_pvalues = [
    [score for _, score in detect_features_drift(reference_window,
                                                 raw_data.loc[start:end],
                                                 column_mapping=data_columns,
                                                 get_scores=True)]
    for start, end in experiment_batches
]

features_historical_drift_pvalues_frame = pd.DataFrame(
    features_historical_drift_pvalues,
    columns=data_columns.numerical_features,
)
In [49]:
# p-value heatmap: reversed red scale ('reds_r') so that *low* p-values
# (strong drift evidence) show up dark.
fig = go.Figure(data=go.Heatmap(
                   z = features_historical_drift_pvalues_frame.transpose(),
                   x = [x[1] for x in experiment_batches],
                   y = features_historical_drift_pvalues_frame.columns,
                   hoverongaps = False,
                   xgap = 1,
                   ygap = 1,
                   zmin = 0,
                   zmax = 1,
                   colorscale = 'reds_r'
                   )
               )

fig.update_xaxes(side="top")

fig.update_layout(
    xaxis_title = "Timestamp",
    yaxis_title = "p-value"
)

fig.show()
In [50]:
#Dataset Drift
# Dataset-level drift verdict (boolean) for every experiment batch,
# each compared against the fixed reference window.
dataset_historical_drift = [
    detect_dataset_drift(raw_data.loc[reference_dates[0]:reference_dates[1]],
                         raw_data.loc[start:end],
                         column_mapping=data_columns)
    for start, end in experiment_batches
]
In [51]:
# One-row heatmap of the boolean dataset-drift verdicts (red = drift).
fig = go.Figure(data=go.Heatmap(
                   z = [[1 if x == True else 0 for x in dataset_historical_drift]],
                   x = [x[1] for x in experiment_batches],
                   y = [''],
                   hoverongaps = False,
                   xgap = 1,
                   ygap = 1,
                   zmin = 0,
                   zmax = 1,
                   colorscale = 'Bluered',
                   showscale = False
                   )
               )

fig.update_xaxes(side="top")

fig.update_layout(
    xaxis_title = "Timestamp",
    yaxis_title = "Dataset Drift"
)
fig.show()
In [52]:
# Same batches, but collect the drift ratio (get_ratio=True) rather
# than the boolean verdict.
dataset_historical_drift_ratio = [
    detect_dataset_drift(raw_data.loc[reference_dates[0]:reference_dates[1]],
                         raw_data.loc[start:end],
                         column_mapping=data_columns,
                         get_ratio=True)
    for start, end in experiment_batches
]
In [53]:
# One-row heatmap of the per-batch drift ratio. zmin is pinned at 0.5 —
# the dataset-drift threshold — so only ratios above it get colour.
fig = go.Figure(data=go.Heatmap(
                   z = [dataset_historical_drift_ratio],
                   x = [x[1] for x in experiment_batches],
                   y = [''],
                   hoverongaps = False,
                   xgap = 1,
                   ygap = 1,
                   zmin = 0.5,
                   zmax = 1,
                   colorscale = 'reds'
                  )
               )

fig.update_xaxes(side="top")

fig.update_layout(
    xaxis_title = "Timestamp",
    yaxis_title = "Dataset Drift"
)
fig.show()
In [54]:
#Log Dataset Drift in MLFlow
#log into MLflow
# NOTE(review): `client` is never used below — consider removing it.
client = MlflowClient()

#set experiment
mlflow.set_experiment('Dataset Drift Analysis with Evidently')

#start new run
# One MLflow run per monthly batch: the batch window is logged as
# parameters and the drift ratio as a metric, so drift history can be
# browsed in the MLflow UI.
for date in experiment_batches:
    with mlflow.start_run() as run:

        # Log parameters
        mlflow.log_param("begin", date[0])
        mlflow.log_param("end", date[1])

        # Log metrics
        metric = detect_dataset_drift(raw_data.loc[reference_dates[0]:reference_dates[1]], 
                           raw_data.loc[date[0]:date[1]],
                           column_mapping=data_columns,
                           get_ratio=True)
        
        mlflow.log_metric('dataset drift', metric)

        print(run.info)
2023/03/30 17:16:18 INFO mlflow.tracking.fluent: Experiment with name 'Dataset Drift Analysis with Evidently' does not exist. Creating a new experiment.
2023/03/30 17:16:18 WARNING mlflow.utils.git_utils: Failed to import Git (the Git executable is probably not on your PATH), so Git SHA is not available. Error: Failed to initialize: Bad git executable.
The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

This initial warning can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|none|n|0: for no warning or exception
    - warn|w|warning|1: for a printed warning
    - error|e|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet

<RunInfo: artifact_uri='file:///C:/Users/Prosenjeet%20Saha/mlruns/774057226463371695/419a47caf7f547d996bac3dc3f917f19/artifacts', end_time=None, experiment_id='774057226463371695', lifecycle_stage='active', run_id='419a47caf7f547d996bac3dc3f917f19', run_name='powerful-bat-149', run_uuid='419a47caf7f547d996bac3dc3f917f19', start_time=1680176778436, status='RUNNING', user_id='Prosenjeet Saha'>
<RunInfo: artifact_uri='file:///C:/Users/Prosenjeet%20Saha/mlruns/774057226463371695/37592e38c8384fd18fa223fe4512a58c/artifacts', end_time=None, experiment_id='774057226463371695', lifecycle_stage='active', run_id='37592e38c8384fd18fa223fe4512a58c', run_name='classy-fish-711', run_uuid='37592e38c8384fd18fa223fe4512a58c', start_time=1680176778781, status='RUNNING', user_id='Prosenjeet Saha'>
<RunInfo: artifact_uri='file:///C:/Users/Prosenjeet%20Saha/mlruns/774057226463371695/2083a82b28bf4bdcabf04123ee394fbc/artifacts', end_time=None, experiment_id='774057226463371695', lifecycle_stage='active', run_id='2083a82b28bf4bdcabf04123ee394fbc', run_name='sincere-owl-625', run_uuid='2083a82b28bf4bdcabf04123ee394fbc', start_time=1680176779109, status='RUNNING', user_id='Prosenjeet Saha'>
<RunInfo: artifact_uri='file:///C:/Users/Prosenjeet%20Saha/mlruns/774057226463371695/0d0261f7e38543349fa0b61a31849107/artifacts', end_time=None, experiment_id='774057226463371695', lifecycle_stage='active', run_id='0d0261f7e38543349fa0b61a31849107', run_name='loud-bear-292', run_uuid='0d0261f7e38543349fa0b61a31849107', start_time=1680176779434, status='RUNNING', user_id='Prosenjeet Saha'>
<RunInfo: artifact_uri='file:///C:/Users/Prosenjeet%20Saha/mlruns/774057226463371695/dddf54c6f48d4d1bae7c67aa544a95a4/artifacts', end_time=None, experiment_id='774057226463371695', lifecycle_stage='active', run_id='dddf54c6f48d4d1bae7c67aa544a95a4', run_name='unleashed-bear-899', run_uuid='dddf54c6f48d4d1bae7c67aa544a95a4', start_time=1680176779696, status='RUNNING', user_id='Prosenjeet Saha'>
<RunInfo: artifact_uri='file:///C:/Users/Prosenjeet%20Saha/mlruns/774057226463371695/0c39a880dca54ae6aef627d6c3a997ed/artifacts', end_time=None, experiment_id='774057226463371695', lifecycle_stage='active', run_id='0c39a880dca54ae6aef627d6c3a997ed', run_name='resilient-bat-38', run_uuid='0c39a880dca54ae6aef627d6c3a997ed', start_time=1680176780053, status='RUNNING', user_id='Prosenjeet Saha'>
In [56]:
#Kolmogorov-Smirnov Test
import numpy as np
from scipy.stats import norm

# Manual one-sample Kolmogorov-Smirnov statistic.
#
# BUG FIX: the original compared i/N directly against the raw sample
# values, i.e. max(i/N - R_i).  That formula is only valid when the
# R_i are Uniform(0,1); for a normal sample the observations must first
# be mapped through the model CDF F(x), as the comment below intends.
np.random.seed(0)  # make the demo reproducible on Restart & Run All

N = 30
# F(X) can be any continuous distribution,
# here I am using normal distribution
f_x = np.random.normal(size=N)
f_x_sorted = np.sort(f_x)

# Theoretical CDF evaluated at each sorted observation
F = norm.cdf(f_x_sorted)

# Calculate max(i/N - F(x_(i)))  — ECDF above the model CDF
plus_max = list()
for i in range(1, N + 1):
    x = i / N - F[i - 1]
    plus_max.append(x)
K_plus_max = np.sqrt(N) * np.max(plus_max)

# Calculate max(F(x_(i)) - (i-1)/N)  — ECDF below the model CDF
minus_max = list()
for i in range(1, N + 1):
    y = (i - 1) / N
    y = F[i - 1] - y
    minus_max.append(y)
K_minus_max = np.sqrt(N) * np.max(minus_max)

# Calculate the (sqrt(N)-scaled) KS statistic: D = K_max / sqrt(N)
K_max = max(K_plus_max, K_minus_max)
In [57]:
#General Steps
#The general steps to run the test are:

#Create an EDF for your sample data (see Empirical Distribution Function for steps),
#Specify a parent distribution (i.e. one that you want to compare your EDF to),
#Graph the two distributions together.
#Measure the greatest vertical distance between the two graphs.
#Calculate the test statistic.
#Find the critical value in the KS table.
#Compare to the critical value.
#Example 1: One Sample Kolmogorov-Smirnov Test
from numpy.random import seed
from numpy.random import poisson

#set seed (e.g. make this example reproducible)
seed(0)

#generate dataset of 100 values that follow a Poisson distribution with mean=5
# NOTE: Poisson data is discrete; testing it against a continuous normal
# in the next cell is purely illustrative (rejection is guaranteed).
data = poisson(5, 100)
In [58]:
from scipy.stats import kstest

#perform Kolmogorov-Smirnov test
# H0: the sample comes from N(0, 1).  A p-value below the significance
# level rejects H0 — expected here, since the data are Poisson(5) counts.
kstest(data, 'norm')
Out[58]:
KstestResult(statistic=0.9072498680518208, pvalue=1.0908062873170287e-103, statistic_location=2, statistic_sign=-1)
In [59]:
#Example 2: Two Sample Kolmogorov-Smirnov Test
from numpy.random import seed
from numpy.random import randn
from numpy.random import lognormal

#set seed (e.g. make this example reproducible)
seed(0)

#generate two datasets
# Standard normal vs. lognormal(mean=3, sigma=1): deliberately very
# different distributions for the two-sample test below.
data1 = randn(100)
data2 = lognormal(3, 1, 100)
In [60]:
from scipy.stats import ks_2samp

#perform Kolmogorov-Smirnov test
# H0: both samples come from the same distribution; the tiny p-value
# rejects it, as expected for normal vs. lognormal data.
ks_2samp(data1, data2)
Out[60]:
KstestResult(statistic=0.99, pvalue=4.417521386399011e-57, statistic_location=2.2697546239876076, statistic_sign=1)
In [61]:
#Cumulative Distribution Function (CDF)
def cdf(sample, x, sort = False):
    """Empirical CDF of ``sample`` evaluated at ``x``.

    Parameters
    ----------
    sample : array-like of numbers
        Observations; a numpy array or a plain Python sequence.
        (The original ``sum(sample <= x)`` raised TypeError for lists.)
    x : float
        Point at which the ECDF is evaluated.
    sort : bool, optional
        Sorts ``sample`` in place first, as before.  Sorting does not
        change the count of observations <= x, so this only matters for
        the caller's copy of the data.

    Returns
    -------
    float
        Fraction of observations that are <= x.
    """
    # Sorts the sample, if unsorted (in place, preserving the old side effect)
    if sort:
        sample.sort()
    values = np.asarray(sample)
    # Counts how many observations are at or below x, then normalises
    return np.count_nonzero(values <= x) / len(values)
In [62]:
#Assessing normality (1-sample test)
def ks_norm(sample):
    """One-sample Kolmogorov-Smirnov test against the standard normal.

    Fixes two defects in the original cell:
    * ``stats`` was never imported (only ``kstest``/``ks_2samp`` were
      imported from scipy.stats), so the cell raised NameError on a
      fresh kernel;
    * the statistic compared F(x) only to the right-continuous ECDF
      i/n, understating D by up to 1/n; the proper two-sided statistic
      also checks the left limit (i-1)/n.

    Parameters
    ----------
    sample : list or ndarray of floats (sorted in place, as before)

    Returns
    -------
    dict with keys "ks_stat" (the two-sided KS statistic) and
    "p_value" (survival function of the exact KS distribution).
    """
    from scipy import stats

    # Sorts the sample (in place, preserving the original side effect)
    sample.sort()
    values = np.asarray(sample, dtype=float)
    n = len(values)
    # Theoretical CDF under N(0, 1) at each sorted observation
    theo = stats.norm.cdf(values)
    ranks = np.arange(1, n + 1)
    # Two-sided KS statistic: largest deviation on either side of each ECDF step
    d_plus = np.max(ranks / n - theo)
    d_minus = np.max(theo - (ranks - 1) / n)
    ks_stat = max(d_plus, d_minus)
    # P-value from the KS distribution survival function (SF = 1 - CDF)
    p_value = stats.kstwo.sf(ks_stat, n)
    return {"ks_stat": ks_stat, "p_value": p_value}
In [64]:
# Create random samples
# NOTE(review): no seed is set here, so these draws (three normals with
# shifted means plus an F(5, 10) sample) differ on every re-run.
norm_a = np.random.normal(loc = 0, scale = 1, size = 500)
norm_b = np.random.normal(loc = 0.1, scale = 1, size = 500)
norm_c = np.random.normal(loc = 3, scale = 1, size = 500)
f_a = np.random.f(dfnum = 5, dfden  = 10, size = 500)
In [65]:
# Bundle the four samples for plotting.  NOTE(review): despite the name,
# `df` is a tuple here, not a DataFrame.
df = (norm_a, norm_b, norm_c, f_a)
In [71]:
# BUG FIX: seaborn is never imported earlier in the notebook, so this
# cell raised NameError on Restart & Run All.
import seaborn as sns

sns.displot(df)
Out[71]:
<seaborn.axisgrid.FacetGrid at 0x183445a56a0>
In [78]:
# Create random samples
# NOTE(review): duplicate of the earlier sampling cell; re-drawing here
# replaces the (unseeded) samples shown in the displot above.
norm_a = np.random.normal(loc = 0, scale = 1, size = 500)
norm_b = np.random.normal(loc = 0.1, scale = 1, size = 500)
norm_c = np.random.normal(loc = 3, scale = 1, size = 500)
f_a = np.random.f(dfnum = 5, dfden  = 10, size = 500)
In [95]:
import matplotlib.pyplot as plt

# Overlaid frequency histograms of the four samples.
x1 = norm_a
x2 = norm_b
x3 = norm_c
x4 = f_a

# Shared style: translucent bars so the overlapping distributions stay visible
kwargs = dict(alpha=0.5, bins=100)

plt.hist(x1, **kwargs, color='g', label='norm_a')
plt.hist(x2, **kwargs, color='b', label='norm_b')
plt.hist(x3, **kwargs, color='r', label='norm_c')
plt.hist(x4, **kwargs, color='y', label='f_a')
plt.gca().set(title='Frequency Histogram', ylabel='count')
# BUG FIX: the original plt.xlim(50, 75) hid all of the data — these
# samples fall roughly in [-4, 8] — leaving the figure blank.  The
# x-axis is now left to autoscale to the data.
plt.legend()
plt.show()
In [ ]: